In [1]:
%matplotlib notebook
import pymc3 as pm
import arviz as az
import theano.tensor as tt
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import graphviz
In [2]:
# Record the pymc3 version the notebook was executed with, for reproducibility.
pm.__version__
Out[2]:
'3.9.3'
In [18]:
# Fixed seed so the sampling runs below are reproducible.
RANDOM_SEED = 8927 # same as used by pymc3 devs
In [3]:
# Load the Trolley data; the file is semicolon-separated.
df_data = pd.read_csv('data/Trolley.csv', sep=';')
df_data.head()
Out[3]:
case response order id age male edu action intention contact story action2
0 cfaqu 4 2 96;434 14 0 Middle School 0 0 1 aqu 1
1 cfbur 3 31 96;434 14 0 Middle School 0 0 1 bur 1
2 cfrub 4 16 96;434 14 0 Middle School 0 0 1 rub 1
3 cibox 3 32 96;434 14 0 Middle School 0 1 1 box 1
4 cibur 3 4 96;434 14 0 Middle School 0 1 1 bur 1

Higher response values mean the action is considered more morally permissible.

In [4]:
# Number of distinct ordinal response categories (responses run 1..K, K=7 here).
NUM_RESP_CATEGORIES = len(df_data['response'].unique())
NUM_RESP_CATEGORIES
Out[4]:
7

0 Reproduce Model from Lecture

Calculate Cutpoints directly from the Data

In [5]:
# Empirical cumulative distribution of the responses and the implied
# log cumulative odds (these serve as the "manual" cutpoints later on).
cum_prob_k = np.array([(df_data['response']<=k).sum() for k in range(1, NUM_RESP_CATEGORIES+1)])/len(df_data)
# The last cumulative probability is exactly 1, so its log odds is +inf.
# Silence the divide-by-zero RuntimeWarning instead of letting numpy emit it;
# matplotlib simply skips the non-finite point when plotting.
with np.errstate(divide='ignore'):
    log_cum_odds_k = np.log(cum_prob_k/(1-cum_prob_k))

fig, axes = plt.subplots(1, 3, figsize=(8, 4), sharex=True)
fig.set_tight_layout(True)
# note: plt.subplots with a single row already returns a 1-D axes array,
# so the previous axes.flatten() call was redundant

axes[0].hist(df_data['response'], bins=np.array(range(NUM_RESP_CATEGORIES+1))+0.5, rwidth=0.5)
axes[0].set_xlabel('response')
axes[0].set_ylabel('counts')

axes[1].plot(range(1,NUM_RESP_CATEGORIES+1), cum_prob_k, '-o')
axes[1].set_xlabel('response')
axes[1].set_ylabel('Cumulative Probability')

axes[2].plot(range(1,NUM_RESP_CATEGORIES+1), log_cum_odds_k, '-o')
axes[2].set_xlabel('response')
axes[2].set_ylabel('Log Cumulative Odds')
Out[5]:
Text(0, 0.5, 'Log Cumulative Odds')

Calculate the Cutpoints with Ordered Logistic Regression

The response needs to start at zero. This is somewhat surprising because the pymc3 docs say that OrderedLogistic is "useful for regression on ordinal data values whose values range from 1 to K." On the other hand, OrderedLogistic inherits from Categorical which has support $x \in [0, 1, 2, \dots K-1]$.

In [6]:
with pm.Model() as model_0:
    # Intercept-only ordered-logistic model: K-1 = 6 ordered cutpoints on the
    # log-cumulative-odds scale, no predictors (eta=0).
    cutpoints = pm.Normal('cutpoints', mu=0, sd=1.5,
                          transform=pm.distributions.transforms.ordered,
                          shape=NUM_RESP_CATEGORIES-1,
                          # strictly increasing start values keep the ordered transform valid
                          testval = [-2.5, -1.5, -0.5, 0.5, 1.5, 2.5])
    
    # the response needs to start at 0 (see the comment above)
    response_shared = pm.Data('response_obs', df_data['response'] - 1)
                              
    _ = pm.OrderedLogistic('response', eta=0, cutpoints=cutpoints, observed=response_shared)
    
    trace_0 = pm.sample(1000, tune=1000, return_inferencedata=True)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cutpoints]
100.00% [8000/8000 00:10<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 11 seconds.
In [8]:
# Posterior summary of model_0 — the cutpoint means should match the
# empirical log cumulative odds computed above.
az.summary(trace_0, round_to=2)
Out[8]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
cutpoints[0] -1.92 0.03 -1.97 -1.86 0.0 0.0 4294.66 4289.24 4300.18 3067.87 1.0
cutpoints[1] -1.27 0.02 -1.31 -1.22 0.0 0.0 4782.08 4767.02 4794.78 2739.55 1.0
cutpoints[2] -0.72 0.02 -0.76 -0.68 0.0 0.0 4436.92 4421.79 4428.40 3505.47 1.0
cutpoints[3] 0.25 0.02 0.21 0.29 0.0 0.0 4563.07 4549.55 4568.99 3725.35 1.0
cutpoints[4] 0.89 0.02 0.85 0.93 0.0 0.0 4646.29 4646.29 4654.40 3635.74 1.0
cutpoints[5] 1.77 0.03 1.71 1.82 0.0 0.0 4888.28 4888.28 4900.70 3398.64 1.0
In [9]:
# Sanity check: the posterior means of the sampled cutpoints should agree
# with the cutpoints derived directly from the empirical log cumulative odds.
print('pymc3\tmanual\tdifference')
print('--------------------------')
sampled_cutpoints = trace_0.posterior.cutpoints.mean(axis=(0,1)).values
manual_cutpoints = log_cum_odds_k[:-1]
for sampled, manual in zip(sampled_cutpoints, manual_cutpoints):
    print('{:.2f} \t{:.2f}\t{:.2e}'.format(sampled, manual, sampled-manual))
pymc3	manual	difference
--------------------------
-1.92 	-1.92	-5.30e-04
-1.27 	-1.27	3.65e-05
-0.72 	-0.72	2.87e-04
0.25 	0.25	-3.97e-05
0.89 	0.89	-1.19e-04
1.77 	1.77	-2.66e-04

1

The DAG looks as follows, where

  • A = Age
  • E = Education
  • R = Response
In [10]:
# Causal DAG: Age -> Education, Age -> Response, Education -> Response.
dag = graphviz.Digraph()
for label in ('A', 'E', 'R'):
    dag.node(label, label)
dag.edges(['ER', 'AR', 'AE'])
dag
Out[10]:
%3 A A E E A->E R R A->R E->R

Data Transformations

Encode Education as an Ordered Variable

In [11]:
# Show the distinct education levels as they appear in the raw data
# (unordered strings; an explicit ordering is defined in the next cell).
print(*df_data['edu'].unique(), sep='\n')
Middle School
Bachelor's Degree
Some College
Master's Degree
High School Graduate
Graduate Degree
Some High School
Elementary School
In [12]:
# Map each education level to an ordinal rank, from 0 (Elementary School)
# up to 7 (Graduate Degree); keep the inverse mapping for later lookups.
edu_to_num = {"Elementary School": 0,
              "Middle School": 1,
              "Some High School": 2,
              "High School Graduate": 3,
              "Some College": 4,
              "Bachelor's Degree": 5,
              "Master's Degree": 6,
              "Graduate Degree": 7
             }
num_to_edu = {num: edu for edu, num in edu_to_num.items()}
df_data['edu_ord'] = df_data['edu'].map(edu_to_num)
In [13]:
# Counts per education level in the raw string encoding.
df_data['edu'].value_counts()
Out[13]:
Bachelor's Degree       3540
Some College            2460
Master's Degree         1410
Graduate Degree         1050
High School Graduate     870
Some High School         420
Middle School            120
Elementary School         60
Name: edu, dtype: int64
In [14]:
# Same counts keyed by the ordinal codes — a sanity check on the mapping.
df_data['edu_ord'].value_counts()
Out[14]:
5    3540
4    2460
6    1410
7    1050
3     870
2     420
1     120
0      60
Name: edu_ord, dtype: int64
In [15]:
# Number of education levels (8); the Dirichlet priors below have one
# increment fewer (transitions between adjacent levels).
NUM_EDU_CATEGORIES = len(edu_to_num)

Standardize Age

This is needed for the model to converge.

In [16]:
# Standardize age to zero mean / unit standard deviation (raw-scale age
# keeps the model from converging, per the note above).
age_mean = df_data['age'].mean()
age_sd = df_data['age'].std()
df_data['age_std'] = (df_data['age'] - age_mean) / age_sd

1.1 Model Including Age and Education

This model blocks the backdoor from Education through Age to Response.

Note: for some reason these models don't work with pm.sample_prior_predictive. This is probably related to https://discourse.pymc.io/t/valueerror-probabilities-are-not-non-negative-when-trying-to-sample-prior-predictive/4559 .

In [21]:
with pm.Model() as model_1_1:
    
    # cutpoints, started at the empirical log cumulative odds computed above
    cutpoints = pm.Normal('cutpoints', mu=0, sd=1.5,
                          transform=pm.distributions.transforms.ordered,
                          shape=NUM_RESP_CATEGORIES-1,
                          testval = log_cum_odds_k[:-1])
    
    # Create shared variables
    action_shared = pm.Data('action', df_data['action'])
    intention_shared = pm.Data('intention', df_data['intention'])
    contact_shared = pm.Data('contact', df_data['contact'])
    age_shared = pm.Data('age', df_data['age_std']) # need to use standardized values
    edu_shared = pm.Data('edu', df_data['edu_ord']) # this is an ordered variable
    
    # the response needs to start at 0 (see the comment above)
    response_shared = pm.Data('response_obs', df_data['response'] - 1)
    
    # Parameters: regularizing Normal(0, 0.5) priors on all slopes
    bA = pm.Normal('bA', mu=0, sd=0.5)
    bC = pm.Normal('bC', mu=0, sd=0.5)
    bI = pm.Normal('bI', mu=0, sd=0.5)
    bIA = pm.Normal('bIA', mu=0, sd=0.5)
    bIC = pm.Normal('bIC', mu=0, sd=0.5)
    bAge = pm.Normal('bAge', mu=0, sd=0.5)
    bE = pm.Normal('bE', mu=0, sd=0.5)
    
    # delta: Dirichlet-distributed increments of the education effect.
    # Prepending a zero makes the cumulative sum run from 0 (lowest level)
    # to 1 (highest level), so bE scales the effect of the full education range.
    delta = pm.Dirichlet('delta', np.full(NUM_EDU_CATEGORIES-1,2))
    delta_j = pm.math.concatenate((tt.zeros(1), delta))
    delta_j_cumsum = tt.cumsum(delta_j)
    
    # regression
    # BI: effect of intention, itself moderated by action and contact
    BI = bI + bIA*action_shared + bIC*contact_shared
    phi = pm.Deterministic('phi', bA*action_shared + BI*intention_shared + bC*contact_shared \
                           + bAge*age_shared + bE*delta_j_cumsum[edu_shared])
    
    _ = pm.OrderedLogistic('response', eta=phi, cutpoints=cutpoints, observed=response_shared)
    
    # NOTE(review): this run produced many divergences (see the log below);
    # model_1_1_1 addresses this with different start values and target_accept.
    # prior_predictive_1_1 = pm.sample_prior_predictive() # Throws an error. Why?
    trace_1_1 = pm.sample(2000, tune=2000, return_inferencedata=True, random_seed=RANDOM_SEED)
    # posterior_predictive_1_1 = pm.sample_posterior_predictive(trace_1_1)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [delta, bE, bAge, bIC, bIA, bI, bC, bA, cutpoints]
100.00% [16000/16000 09:00<00:00 Sampling 4 chains, 1,467 divergences]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 541 seconds.
There were 1454 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.4957531518870651, but should be close to 0.8. Try to increase the number of tuning steps.
There were 13 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.
In [68]:
# Summary excluding the per-observation deterministic 'phi'.
az.summary(trace_1_1, var_names=['~phi'], round_to=2)
Out[68]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
bA -0.48 0.05 -0.58 -0.38 0.00 0.00 771.13 771.13 780.23 2085.26 1.00
bC -0.34 0.07 -0.47 -0.22 0.00 0.00 978.08 903.50 979.65 1478.39 1.01
bI -0.29 0.06 -0.39 -0.18 0.00 0.00 454.55 454.45 456.69 1045.26 1.02
bIA -0.44 0.08 -0.58 -0.29 0.00 0.00 286.99 286.99 291.60 1977.79 1.01
bIC -1.25 0.10 -1.44 -1.07 0.01 0.00 217.78 193.75 206.35 800.14 1.03
bAge -0.10 0.02 -0.14 -0.06 0.00 0.00 50.05 50.05 52.88 120.61 1.05
bE 0.20 0.14 -0.19 0.40 0.04 0.03 15.36 15.36 32.36 14.87 1.08
cutpoints[0] -2.51 0.11 -2.77 -2.30 0.03 0.02 19.17 18.40 35.76 15.66 1.08
cutpoints[1] -1.81 0.11 -2.05 -1.60 0.02 0.02 20.72 19.53 41.30 16.20 1.07
cutpoints[2] -1.21 0.11 -1.48 -1.02 0.03 0.02 18.86 17.44 36.59 15.04 1.08
cutpoints[3] -0.17 0.11 -0.43 0.01 0.02 0.02 18.86 14.67 36.18 15.16 1.08
cutpoints[4] 0.50 0.11 0.23 0.68 0.02 0.02 18.87 18.87 36.24 15.24 1.08
cutpoints[5] 1.41 0.11 1.14 1.59 0.02 0.02 19.51 19.51 35.21 16.93 1.08
delta[0] 0.11 0.08 0.01 0.25 0.01 0.01 104.93 86.95 145.19 172.77 1.03
delta[1] 0.12 0.08 0.00 0.26 0.00 0.00 391.81 391.81 387.69 1471.66 1.02
delta[2] 0.09 0.06 0.00 0.20 0.00 0.00 330.80 330.80 381.00 1112.41 1.01
delta[3] 0.08 0.07 0.00 0.20 0.01 0.01 29.57 29.57 47.91 41.53 1.06
delta[4] 0.42 0.16 0.01 0.63 0.03 0.02 25.17 25.17 44.21 19.11 1.06
delta[5] 0.08 0.06 0.00 0.19 0.00 0.00 847.97 847.97 621.38 708.70 1.00
delta[6] 0.09 0.06 0.01 0.20 0.00 0.00 958.54 958.54 831.50 569.10 1.01
In [46]:
# Forest plot with combined=False, so each chain is drawn separately.
az.plot_forest(trace_1_1, var_names=['~cutpoints', '~delta', '~phi'], combined=False)
Out[46]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fa7f5499dd0>],
      dtype=object)

Note how one of the chains gets a lot more variance for bE. When I use combined=True the values are slightly different from the summary, because the forest plot shows the median instead of the mean.

In [66]:
# Per-chain posterior means of bE; the first chain clearly deviates.
trace_1_1.posterior.bE.values.mean(axis=1)
Out[66]:
array([0.11396679, 0.22768285, 0.23206889, 0.23841915])
In [67]:
# Per-chain medians; the first chain's median is close to the others
# even though its mean is not.
np.median(trace_1_1.posterior.bE.values, axis=1)
Out[67]:
array([0.19272587, 0.24142599, 0.23902027, 0.24231618])
In [70]:
# Trace plots; assigning to _ suppresses the axes-array output.
_ =az.plot_trace(trace_1_1, var_names=['~cutpoints', '~delta', '~phi'])

You can see that there is an issue with the blue chain.

1.1.1 Fixing the Trace

In [82]:
with pm.Model() as model_1_1_1:
    
    # cutpoints — now started at evenly spaced values (-2.5 ... 2.5) instead
    # of the empirical log cumulative odds used in model_1_1
    cutpoints = pm.Normal('cutpoints', mu=0, sd=1.5,
                          transform=pm.distributions.transforms.ordered,
                          shape=NUM_RESP_CATEGORIES-1,
                          testval = np.array(range(6))-2.5)
    
    # Create shared variables
    action_shared = pm.Data('action', df_data['action'])
    intention_shared = pm.Data('intention', df_data['intention'])
    contact_shared = pm.Data('contact', df_data['contact'])
    age_shared = pm.Data('age', df_data['age_std']) # need to use standardized values
    edu_shared = pm.Data('edu', df_data['edu_ord']) # this is an ordered variable
    
    # the response needs to start at 0 (see the comment above)
    response_shared = pm.Data('response_obs', df_data['response'] - 1)
    
    # Parameters: regularizing Normal(0, 0.5) priors on all slopes
    bA = pm.Normal('bA', mu=0, sd=0.5)
    bC = pm.Normal('bC', mu=0, sd=0.5)
    bI = pm.Normal('bI', mu=0, sd=0.5)
    bIA = pm.Normal('bIA', mu=0, sd=0.5)
    bIC = pm.Normal('bIC', mu=0, sd=0.5)
    bAge = pm.Normal('bAge', mu=0, sd=0.5)
    bE = pm.Normal('bE', mu=0, sd=0.5)
    
    # delta: Dirichlet increments of the education effect (see model_1_1)
    delta = pm.Dirichlet('delta', np.full(NUM_EDU_CATEGORIES-1,2))
    delta_j = pm.math.concatenate((tt.zeros(1), delta))
    delta_j_cumsum = tt.cumsum(delta_j)
    
    # regression
    # BI: effect of intention, moderated by action and contact
    BI = bI + bIA*action_shared + bIC*contact_shared
    phi = pm.Deterministic('phi', bA*action_shared + BI*intention_shared + bC*contact_shared \
                           + bAge*age_shared + bE*delta_j_cumsum[edu_shared])
    
    _ = pm.OrderedLogistic('response', eta=phi, cutpoints=cutpoints, observed=response_shared)
    
    # target_accept raised to 0.9; together with the new start values this
    # run finished without divergences (see the log below)
    # prior_predictive_1_1_1 = pm.sample_prior_predictive() # Throws an error. Why?
    trace_1_1_1 = pm.sample(2000, tune=2000, target_accept=0.9, return_inferencedata=True, random_seed=RANDOM_SEED)
    # posterior_predictive_1_1_1 = pm.sample_posterior_predictive(trace_1_1_1)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [delta, bE, bAge, bIC, bIA, bI, bC, bA, cutpoints]
100.00% [16000/16000 11:35<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 696 seconds.
The number of effective samples is smaller than 25% for some parameters.
In [83]:
# Summary after the fix: r_hat is 1.0 for all parameters.
az.summary(trace_1_1_1, var_names=['~phi'], round_to=2)
Out[83]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
bA -0.48 0.05 -0.57 -0.37 0.0 0.0 4106.46 4055.97 4107.05 5268.54 1.0
bC -0.35 0.07 -0.47 -0.22 0.0 0.0 3780.35 3776.66 3778.30 5552.27 1.0
bI -0.29 0.06 -0.40 -0.19 0.0 0.0 3604.16 3591.95 3602.78 5127.27 1.0
bIA -0.43 0.08 -0.58 -0.29 0.0 0.0 4152.09 4152.09 4150.10 5507.70 1.0
bIC -1.24 0.10 -1.41 -1.05 0.0 0.0 4111.82 4111.82 4121.04 5050.83 1.0
bAge -0.10 0.02 -0.14 -0.06 0.0 0.0 3783.66 3783.66 3971.15 2962.59 1.0
bE 0.23 0.11 0.03 0.42 0.0 0.0 1104.92 1104.92 1938.18 863.77 1.0
cutpoints[0] -2.49 0.10 -2.64 -2.31 0.0 0.0 1313.45 1262.92 1826.30 974.40 1.0
cutpoints[1] -1.79 0.09 -1.95 -1.62 0.0 0.0 1298.09 1232.06 1834.28 994.80 1.0
cutpoints[2] -1.20 0.09 -1.36 -1.03 0.0 0.0 1278.87 1185.60 1824.93 973.65 1.0
cutpoints[3] -0.16 0.09 -0.31 0.02 0.0 0.0 1252.48 930.11 1792.60 987.62 1.0
cutpoints[4] 0.52 0.09 0.36 0.69 0.0 0.0 1259.68 1259.68 1823.54 972.56 1.0
cutpoints[5] 1.42 0.09 1.25 1.58 0.0 0.0 1280.42 1280.42 1844.09 1015.78 1.0
delta[0] 0.11 0.07 0.01 0.24 0.0 0.0 4337.00 3873.99 4544.76 4508.44 1.0
delta[1] 0.12 0.08 0.00 0.25 0.0 0.0 7605.46 7605.46 6400.41 3943.33 1.0
delta[2] 0.09 0.06 0.00 0.20 0.0 0.0 4231.40 3158.80 5354.94 3987.24 1.0
delta[3] 0.07 0.05 0.00 0.16 0.0 0.0 2933.86 2871.39 3811.59 2876.68 1.0
delta[4] 0.44 0.14 0.12 0.69 0.0 0.0 1588.17 1588.17 2208.70 885.47 1.0
delta[5] 0.08 0.06 0.00 0.19 0.0 0.0 7029.29 5671.92 7867.68 5637.64 1.0
delta[6] 0.10 0.06 0.01 0.22 0.0 0.0 7346.24 6125.50 7751.13 5745.39 1.0
In [84]:
# Per-chain forest plot for the repaired model.
az.plot_forest(trace_1_1_1, var_names=['~cutpoints', '~delta', '~phi'], combined=False)
Out[84]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fa7e1648990>],
      dtype=object)
In [85]:
# Trace plots for the repaired model.
_ =az.plot_trace(trace_1_1_1, var_names=['~cutpoints', '~delta', '~phi'])

1.2 Model Without Age

In [47]:
with pm.Model() as model_1_2:
    
    # Same structure as model_1_1 but without Age, i.e. the backdoor path
    # through Age (see the DAG above) is left open.
    # cutpoints, started at the empirical log cumulative odds
    cutpoints = pm.Normal('cutpoints', mu=0, sd=1.5,
                          transform=pm.distributions.transforms.ordered,
                          shape=NUM_RESP_CATEGORIES-1,
                          testval = log_cum_odds_k[:-1])
    
    # Create shared variables
    action_shared = pm.Data('action', df_data['action'])
    intention_shared = pm.Data('intention', df_data['intention'])
    contact_shared = pm.Data('contact', df_data['contact'])
    edu_shared = pm.Data('edu', df_data['edu_ord']) # this is an ordered variable
    
    # the response needs to start at 0 (see the comment above)
    response_shared = pm.Data('response_obs', df_data['response'] - 1)
    
    # Parameters: regularizing Normal(0, 0.5) priors on all slopes
    bA = pm.Normal('bA', mu=0, sd=0.5)
    bC = pm.Normal('bC', mu=0, sd=0.5)
    bI = pm.Normal('bI', mu=0, sd=0.5)
    bIA = pm.Normal('bIA', mu=0, sd=0.5)
    bIC = pm.Normal('bIC', mu=0, sd=0.5)
    bE = pm.Normal('bE', mu=0, sd=0.5)
    
    # delta: Dirichlet increments of the education effect (see model_1_1)
    delta = pm.Dirichlet('delta', np.full(NUM_EDU_CATEGORIES-1,2))
    delta_j = pm.math.concatenate((tt.zeros(1), delta))
    delta_j_cumsum = tt.cumsum(delta_j)

    
    # regression
    # BI: effect of intention, moderated by action and contact
    BI = bI + bIA*action_shared + bIC*contact_shared
    phi = pm.Deterministic('phi', bA*action_shared + BI*intention_shared + bC*contact_shared \
                           + bE*delta_j_cumsum[edu_shared])

    
    _ = pm.OrderedLogistic('response', eta=phi, cutpoints=cutpoints, observed=response_shared)
    # prior_predictive_1_2 = pm.sample_prior_predictive() # Throws an error. Why?
    trace_1_2 = pm.sample(2000, tune=2000, return_inferencedata=True)
    # posterior_predictive_1_2 = pm.sample_posterior_predictive(trace_1_2)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [delta, bE, bIC, bIA, bI, bC, bA, cutpoints]
100.00% [16000/16000 12:36<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 757 seconds.
In [48]:
# Without Age in the model the total education effect bE comes out negative.
az.summary(trace_1_2, var_names=['~phi'], round_to=2)
Out[48]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
bA -0.47 0.05 -0.57 -0.37 0.0 0.0 6184.22 6126.99 6187.96 6412.50 1.0
bC -0.34 0.07 -0.47 -0.21 0.0 0.0 5627.47 5627.47 5629.70 5419.50 1.0
bI -0.29 0.06 -0.40 -0.18 0.0 0.0 5158.76 5151.27 5157.59 5722.33 1.0
bIA -0.44 0.08 -0.58 -0.29 0.0 0.0 5544.54 5502.57 5546.75 5775.01 1.0
bIC -1.24 0.10 -1.42 -1.06 0.0 0.0 5376.15 5376.15 5381.56 5684.87 1.0
bE -0.32 0.16 -0.63 -0.03 0.0 0.0 3605.41 3519.55 3664.98 3280.28 1.0
cutpoints[0] -2.88 0.14 -3.16 -2.62 0.0 0.0 3551.56 3537.06 3553.41 3370.59 1.0
cutpoints[1] -2.18 0.14 -2.45 -1.91 0.0 0.0 3556.32 3525.47 3576.31 3471.93 1.0
cutpoints[2] -1.59 0.14 -1.86 -1.33 0.0 0.0 3515.46 3515.46 3527.86 3462.85 1.0
cutpoints[3] -0.55 0.14 -0.82 -0.29 0.0 0.0 3534.85 3448.30 3553.30 3528.04 1.0
cutpoints[4] 0.12 0.14 -0.16 0.37 0.0 0.0 3518.34 3381.05 3554.89 3296.68 1.0
cutpoints[5] 1.02 0.14 0.75 1.29 0.0 0.0 3592.33 3592.33 3610.39 3726.53 1.0
delta[0] 0.22 0.13 0.01 0.46 0.0 0.0 5461.09 4881.12 5449.17 4623.12 1.0
delta[1] 0.14 0.09 0.01 0.30 0.0 0.0 10052.24 8341.26 9341.78 4914.58 1.0
delta[2] 0.19 0.11 0.02 0.39 0.0 0.0 8180.97 7826.57 7730.44 5130.48 1.0
delta[3] 0.18 0.10 0.01 0.35 0.0 0.0 7882.42 7386.71 7265.60 4842.80 1.0
delta[4] 0.04 0.05 0.00 0.11 0.0 0.0 3261.18 3261.18 5531.36 3843.33 1.0
delta[5] 0.10 0.07 0.00 0.22 0.0 0.0 8338.50 7115.22 8546.36 6176.22 1.0
delta[6] 0.12 0.08 0.01 0.26 0.0 0.0 8226.55 7007.36 8182.67 5300.96 1.0
In [49]:
# Per-chain forest plot for the model without Age.
az.plot_forest(trace_1_2, var_names=['~cutpoints', '~delta', '~phi'], combined=False)
Out[49]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fa7e6c48550>],
      dtype=object)

In this case the effect of Education is negative.

In [71]:
# Trace plots for the model without Age.
_ =az.plot_trace(trace_1_2, var_names=['~cutpoints', '~delta', '~phi'])

2

In [87]:
 with pm.Model() as model_2_1:
    
    # cutpoints
    cutpoints = pm.Normal('cutpoints', mu=0, sd=1.5,
                          transform=pm.distributions.transforms.ordered,
                          shape=NUM_RESP_CATEGORIES-1,
                          testval = np.array(range(6))-2.5)
    
    # Create shared variables
    action_shared = pm.Data('action', df_data['action'])
    intention_shared = pm.Data('intention', df_data['intention'])
    contact_shared = pm.Data('contact', df_data['contact'])
    age_shared = pm.Data('age', df_data['age_std']) # need to use standardized values
    edu_shared = pm.Data('edu', df_data['edu_ord']) # this is an ordered variable
    male_shared = pm.Data('male', df_data['male'])
    
    # the response needs to start at 0 (see the comment above)
    response_shared = pm.Data('response_obs', df_data['response'] - 1)
    
    # Parameters
    bA = pm.Normal('bA', mu=0, sd=0.5)
    bC = pm.Normal('bC', mu=0, sd=0.5)
    bI = pm.Normal('bI', mu=0, sd=0.5)
    bIA = pm.Normal('bIA', mu=0, sd=0.5)
    bIC = pm.Normal('bIC', mu=0, sd=0.5)
    bAge = pm.Normal('bAge', mu=0, sd=0.5)
    bE = pm.Normal('bE', mu=0, sd=0.5)
    bM = pm.Normal('bG', mu=0, sd=0.5)
    
    delta = pm.Dirichlet('delta', np.full(NUM_EDU_CATEGORIES-1,2))
    delta_j = pm.math.concatenate((tt.zeros(1), delta))
    delta_j_cumsum = tt.cumsum(delta_j)
    
    # regression
    BI = bI + bIA*action_shared + bIC*contact_shared
    phi =  bA*action_shared + BI*intention_shared + bC*contact_shared \
           + bAge*age_shared + bE*delta_j_cumsum[edu_shared] \
           + bM*male_shared
    
    _ = pm.OrderedLogistic('response', eta=phi, cutpoints=cutpoints, observed=response_shared)
    
    # prior_predicitive_2_1 = pm.sample_prior_predictive() # Throws an error. Why?
    trace_2_1 = pm.sample(2000, tune=2000, target_accept=0.9, return_inferencedata=True, random_seed=RANDOM_SEED)
    # posterior_predictive_2_1 = pm.sample_posterior_predictive(trace_2_1)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [delta, bG, bE, bAge, bIC, bIA, bI, bC, bA, cutpoints]
100.00% [16000/16000 15:08<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 909 seconds.
In [88]:
# Full summary; no '~phi' filter is needed because phi was not stored
# as a Deterministic in model_2_1.
az.summary(trace_2_1, round_to=2)
Out[88]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
bA -0.48 0.05 -0.58 -0.38 0.0 0.0 5122.81 5121.49 5110.64 6234.01 1.0
bC -0.35 0.07 -0.48 -0.22 0.0 0.0 5344.40 5344.40 5339.43 6024.46 1.0
bI -0.29 0.06 -0.41 -0.19 0.0 0.0 4402.27 4402.27 4400.63 5797.91 1.0
bIA -0.44 0.08 -0.59 -0.30 0.0 0.0 5086.32 5050.46 5083.93 6056.31 1.0
bIC -1.26 0.10 -1.44 -1.08 0.0 0.0 4817.28 4764.73 4820.85 5818.96 1.0
bAge -0.07 0.02 -0.11 -0.03 0.0 0.0 4758.49 4758.49 4756.75 6111.67 1.0
bE -0.00 0.17 -0.31 0.29 0.0 0.0 2671.21 2671.21 2779.01 4963.96 1.0
bG 0.57 0.04 0.50 0.63 0.0 0.0 13364.02 13229.48 13365.63 5817.07 1.0
cutpoints[0] -2.37 0.13 -2.61 -2.13 0.0 0.0 2865.04 2865.04 2874.08 4841.53 1.0
cutpoints[1] -1.67 0.13 -1.92 -1.45 0.0 0.0 2842.87 2842.87 2853.37 5125.07 1.0
cutpoints[2] -1.07 0.13 -1.31 -0.84 0.0 0.0 2840.14 2840.14 2848.93 5218.08 1.0
cutpoints[3] -0.01 0.13 -0.25 0.22 0.0 0.0 2802.66 2802.66 2810.38 5294.52 1.0
cutpoints[4] 0.68 0.13 0.44 0.91 0.0 0.0 2793.37 2768.62 2809.44 5039.50 1.0
cutpoints[5] 1.61 0.13 1.36 1.84 0.0 0.0 2883.90 2864.87 2894.61 5159.00 1.0
delta[0] 0.15 0.10 0.01 0.33 0.0 0.0 9423.58 7246.25 9334.35 5721.34 1.0
delta[1] 0.14 0.09 0.01 0.30 0.0 0.0 11051.47 9021.48 8591.90 4454.57 1.0
delta[2] 0.14 0.09 0.01 0.31 0.0 0.0 7995.03 6727.82 7768.98 4904.78 1.0
delta[3] 0.14 0.10 0.00 0.32 0.0 0.0 5279.20 5279.20 5047.90 5460.66 1.0
delta[4] 0.18 0.15 0.00 0.46 0.0 0.0 2819.73 2819.73 2974.00 6197.54 1.0
delta[5] 0.12 0.08 0.00 0.27 0.0 0.0 8617.98 6948.48 8847.99 6513.00 1.0
delta[6] 0.13 0.08 0.01 0.27 0.0 0.0 11593.75 9330.17 11563.11 5755.58 1.0

Now the effect of Education is vanishing. Maybe there is an "artificial" relationship between Gender and Education based on how the data was obtained.

In [89]:
# Per-chain forest plot of the slopes including the gender effect.
az.plot_forest(trace_2_1, var_names=['~cutpoints', '~delta'], combined=False)
Out[89]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fa7e793d6d0>],
      dtype=object)
In [90]:
# Trace plots for the model with gender.
_ = az.plot_trace(trace_2_1, var_names=['~cutpoints', '~delta'])
In [ ]: